# Read the listings file; empty strings and the literal "NA" are both treated
# as missing values.
data <- read.csv(
  "AB_NYC_2019.csv",
  na.strings = c("", "NA")
)
# Keep the raw import in `data` and work on a copy.
dat <- data
# Upset plot of which columns are missing together.
gg_miss_upset(dat)
# Number of rows read.
nrow(dat)
## [1] 48895
# Availability cannot be used as a measurement for popularity (what does it
# mean?) -> use the number of reviews instead.
#
# The note above was previously fused onto the same line as the statement
# below, which left `avail_0` undefined for the plots that use it.
# Flag listings whose availability_365 is exactly 0.
dat <- data %>%
  mutate(avail_0 = availability_365 == 0)
# Bucket listings by minimum stay: up to 7 nights is "short", up to 45 nights
# is "middle", anything longer is "long".
dat <- dat %>%
  mutate(
    term = if_else(
      minimum_nights <= 7,
      "short",
      if_else(minimum_nights <= 45, "middle", "long")
    )
  )
#dat%>%filter(minimum_nights>3,availability_365==0,number_of_reviews==0)
# Density of log(price), one curve per value of avail_0.
# log() of a non-positive price is not finite, hence the dropped rows below
# (presumably listings with price 0 -- see the commented-out check further down).
dat %>%
  ggplot(aes(x = log(price), fill = avail_0)) +
  geom_density(alpha = 0.5)
## Warning: Removed 11 rows containing non-finite values (stat_density).
# Same comparison for the review count; log(1 + x) keeps zero-review listings.
dat %>%
  ggplot(aes(x = log(1 + number_of_reviews), fill = avail_0)) +
  geom_density(alpha = 0.5)
#ids=dat%>%filter(price==0)%>%pull(host_id)
#dat%>%filter(host_id%in%ids)
# 99th percentile of minimum_nights.
with(dat, quantile(minimum_nights, probs = 0.99))
## 99%
## 45
# Share of listings whose minimum stay is at most 30 nights.
with(dat, mean(minimum_nights <= 30))
## [1] 0.9847224
# Price distribution for each minimum-stay bucket (`term`).
ggplot(dat, aes(x = log(price), fill = term)) +
  geom_density(alpha = 0.5)
## Warning: Removed 11 rows containing non-finite values (stat_density).
# Review counts per bucket, on the log(1 + x) scale.
ggplot(dat, aes(x = term, y = log(1 + number_of_reviews))) +
  geom_boxplot(alpha = 0.5)
# Log-price per bucket.
ggplot(dat, aes(x = term, y = log(price))) +
  geom_boxplot(alpha = 0.5)
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).
# Collect the distinct (neighbourhood_group, neighbourhood) pairs in grouped
# order, so that neighbourhoods of the same group sit next to each other.
order_nei <- dat %>%
  group_by(neighbourhood_group, neighbourhood) %>%
  summarise() %>%
  pull(neighbourhood) %>%
  as.character()
# Re-level neighbourhood so plot axes follow that order.
dat <- dat %>%
  mutate(neighbourhood = factor(neighbourhood, levels = order_nei))

# Log-price by neighbourhood, then by neighbourhood group.
ggplot(dat, aes(x = neighbourhood, y = log(price))) +
  geom_boxplot()
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).
ggplot(dat, aes(x = neighbourhood_group, y = log(price))) +
  geom_boxplot()
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).

# Listing counts by neighbourhood (coloured by group), then by group.
ggplot(dat, aes(x = neighbourhood, fill = neighbourhood_group)) +
  geom_bar()
ggplot(dat, aes(x = neighbourhood_group, fill = neighbourhood_group)) +
  geom_bar()
# Five neighbourhoods with the most listings.
dat %>%
  group_by(neighbourhood_group, neighbourhood) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  head(n = 5)
## # A tibble: 5 x 3
## # Groups: neighbourhood_group [2]
## neighbourhood_group neighbourhood count
## <fct> <fct> <int>
## 1 Brooklyn Williamsburg 3920
## 2 Brooklyn Bedford-Stuyvesant 3714
## 3 Manhattan Harlem 2658
## 4 Brooklyn Bushwick 2465
## 5 Manhattan Upper West Side 1971
# Log-price by room type.
ggplot(dat, aes(x = room_type, y = log(price))) +
  geom_boxplot()
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).
# Mosaic of room type within each neighbourhood group.
ggplot(dat) +
  geom_mosaic(
    aes(
      x = product(room_type, neighbourhood_group),
      fill = room_type
    )
  )
library("ggmap")
# Black-and-white base map for the bounding box that encloses the listings.
# NOTE(review): recent ggmap releases reportedly no longer serve Stamen tiles
# (they moved to Stadia Maps and need an API key) -- confirm this call still
# works with the installed ggmap version.
ny.map <- get_map(
  location = c(left = -74.2445, right = -73.71298, bottom = 40.49975, top = 40.9131),
  color = "bw",
  maptype = "toner",
  source = "stamen"
)
# Overlay 2-D density contours of listing locations; both fill and
# transparency follow the contour level.
ggmap(ny.map) +
  stat_density2d(
    data = dat,
    aes(x = longitude, y = latitude, fill = ..level.., alpha = ..level..),
    geom = "polygon"
  ) +
  scale_fill_gradient(low = "green", high = "red") +
  # guide = "none" hides the alpha legend; guide = FALSE is deprecated.
  scale_alpha(range = c(0, 0.75), guide = "none")
# Picture used as the backdrop of the scatter maps below.
img <- readJPEG("New_York_City_.jpg")
# Nine-stop colour ramp running from dark blue through cyan/yellow to dark red.
jet.colors <- colorRampPalette(c(
  "#00007F", "blue", "#007FFF", "cyan", "#7FFF7F",
  "yellow", "#FF7F00", "red", "#7F0000"
))
# The same backdrop layer is shared by both maps: the image is stretched to
# fill the longitude/latitude box given here.
nyc_backdrop <- annotation_custom(
  rasterGrob(img, width = unit(1, "npc"), height = unit(1, "npc")),
  xmin = -74.258, xmax = -73.69, ymin = 40.49, ymax = 40.92
)

# Listings coloured by log(1 + price); the colour scale is limited to [3, 7].
ggplot(dat, aes(x = longitude, y = latitude, color = log(1 + price))) +
  nyc_backdrop +
  geom_point(cex = 0.4, alpha = 0.5) +
  scale_colour_gradientn(colors = jet.colors(7), limits = c(3, 7))

# Listings coloured by availability_365 (red = low, grey = high).
ggplot(dat, aes(x = longitude, y = latitude, color = availability_365)) +
  nyc_backdrop +
  geom_point(cex = 0.4, alpha = 0.5) +
  scale_colour_gradient(low = 'red', high = 'grey')
library(tidytext)
library("textdata")
# Split each listing name into lower-case tokens.
#  * A handful of separator characters are turned into spaces first.
#  * The short function words are removed only as whole words (\\b anchors);
#    without the anchors the pattern also deleted those letters from inside
#    longer words (e.g. "stone" -> "se").
words <- dat$name %>%
  str_to_lower() %>%
  str_replace_all("\\+|&|@|\\/|!|;|,", " ") %>%
  str_replace_all("\\b(by|the|of|in|on|to)\\b", " ") %>%
  str_split(" ")
# Drop empty tokens, and the NA token produced by a missing name so that such
# a listing counts as 0 words instead of 1.
words <- map(words, ~ .x[!is.na(.x) & .x != ""])
word_count <- map_dbl(words, ~ length(.x))
dat <- dat %>% mutate(wcount = word_count)

# Map the column stored in `dat` rather than the free-standing vector, so the
# plots stay correct if `dat` is later filtered or reordered.
ggplot(data = dat, mapping = aes(x = wcount, y = log(price))) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 11 rows containing non-finite values (stat_smooth).
ggplot(data = dat, mapping = aes(x = wcount, y = log(1 + number_of_reviews))) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
library(wordcloud)
## Loading required package: RColorBrewer
# First word of every listing name, lower-cased, with punctuation stripped.
# The pattern removes punctuation only: it used to end in `|a|the`, which
# deleted every letter "a" from every token. "a" and "the" are now removed by
# the stop-word filter below instead.
# NOTE(review): word() keeps only the first word of each name -- confirm that
# this (rather than every word) is what the cloud should show.
names <- dat$name %>%
  str_to_lower() %>%
  word() %>%
  str_replace_all("\\+|@|\\/|!|;|,|\\*|\\(|\\)|:|-|_|¡|\\.|\\'|‘|’|\"|“|”", "")
# `stop_words` (tidytext) is a data frame; the words live in its `word`
# column. Matching against the data frame itself never removed anything.
# Empty and missing tokens are dropped as well so they cannot enter the cloud.
keep <- !is.na(names) & names != "" & !names %in% stop_words$word
all_words <- names[keep] %>% table()
wordcloud(names(all_words), all_words, max.words = 100)
# References:
# http://www2.stat.duke.edu/~cr173/Sta444_Fa18/
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3074178/pdf/nihms237255.pdf
# https://www.google.com/search?q=spatial+prior+hierachical+model+r+package&oq=spatial+prior+hierachical+model+r+package&aqs=chrome..69i57.16517j0j7&sourceid=chrome&ie=UTF-8
# Packages: HSAR, spBayes, CARBayes